#install.packages(c("forecast", "expsmooth", "seasonal"))
library(TTR)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
library(expsmooth)
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.4.2 ✔ fma 2.5
##
library(seasonal)
library(MASS)
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
##
## cement, housing, petrol
library(stats)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
## method from
## fitted.Arima forecast
## plot.Arima forecast
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(forecast)
library(ggplot2)
library(tseries)
library(imputeTS)
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:tseries':
##
## na.remove
library(vars)
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
##
## na.locf
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(timetk)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
##
## accuracy
library(lmtest)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(xts)
library(dplyr)
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
path <- "/Users/aashishsingh/Desktop/Time Series - Final Project/"
pm2_5_data <- read.csv(paste0(path, "la_pm2_5_pollutant_data.csv"),
na.strings=c("", "NA"))
ozone_data <- read.csv(paste0(path, "la_ozone_pollutant_data.csv"),
na.strings=c("", "NA"))
# Convert data into ts objects
pm2_5_ts <- ts(pm2_5_data$PM2.5.AQI.Value, start = c(1999,1,1), frequency = 365.25)
ozone_ts <- ts(ozone_data$Ozone.AQI.Value, start = c(1999,1,1), frequency = 365.25)
plot(pm2_5_ts, main="Los Angeles PM 2.5 AQI")
plot(ozone_ts, main="Los Angeles Ozone AQI")
kpss.test(pm2_5_ts)
## Warning in kpss.test(pm2_5_ts): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: pm2_5_ts
## KPSS Level = 6.951, Truncation lag parameter = 12, p-value = 0.01
# The reported p-value is 0.01, which is smaller than 0.05, and would suggest
# that we reject the null hypothesis of level stationarity and conclude that
# there is evidence that the data is non-stationary
adf.test(pm2_5_ts)
## Warning in adf.test(pm2_5_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pm2_5_ts
## Dickey-Fuller = -14.554, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
# Similarly the Augumented Dickey-Fuller test results in p-value of 0.01 (<0.05)
# where we do reject the null hypothesis that the time series is non-stationary
# and thus data is stationary.
# These are opposite results so we use ACF PACF plots to see if the series is stationary
acf(pm2_5_ts, main = "ACF of Original Time Series")
pacf(pm2_5_ts, main = "PACF of Original Time Series")
#### Step 3: Prepare data for Weekly & Monthly extraction and EDA
# Create data with Dates
pm2_5_df <- as.data.frame(pm2_5_ts)
pm2_5_df$Date <- mdy(pm2_5_data$Date)
colnames(pm2_5_df) <- c("PM2.5.AQI.Value", "Date")
# Convert to xts format for weekly & monthly
pm2_5_xts <- as.xts(pm2_5_df, order.by=pm2_5_df$Date)
pm2_5_xts <- pm2_5_xts[,-2]
# Let's see how seasonality looks over time and is the variance changing
pm2_5_df$Month <- month(pm2_5_df$Date)
pm2_5_df$Year <- year(pm2_5_df$Date)
avg_aqi_by_month_year <- pm2_5_df %>%
group_by(pm2_5_df$Year, pm2_5_df$Month) %>%
summarise(
avg_value = mean(PM2.5.AQI.Value)
)
## `summarise()` has grouped output by 'pm2_5_df$Year'. You can override using the
## `.groups` argument.
colnames(avg_aqi_by_month_year) <- c("Year", "Month", "avg_value")
reshape_avg_aqi_by_month_year <-
sqldf(
"SELECT
Month,
MAX(CASE WHEN Year = 1999 THEN avg_value END) AS Year_1999,
MAX(CASE WHEN Year = 2000 THEN avg_value END) AS Year_2000,
MAX(CASE WHEN Year = 2001 THEN avg_value END) AS Year_2001,
MAX(CASE WHEN Year = 2002 THEN avg_value END) AS Year_2002,
MAX(CASE WHEN Year = 2003 THEN avg_value END) AS Year_2003,
MAX(CASE WHEN Year = 2004 THEN avg_value END) AS Year_2004,
MAX(CASE WHEN Year = 2005 THEN avg_value END) AS Year_2005,
MAX(CASE WHEN Year = 2006 THEN avg_value END) AS Year_2006,
MAX(CASE WHEN Year = 2007 THEN avg_value END) AS Year_2007,
MAX(CASE WHEN Year = 2008 THEN avg_value END) AS Year_2008,
MAX(CASE WHEN Year = 2009 THEN avg_value END) AS Year_2009,
MAX(CASE WHEN Year = 2010 THEN avg_value END) AS Year_2010,
MAX(CASE WHEN Year = 2011 THEN avg_value END) AS Year_2011,
MAX(CASE WHEN Year = 2012 THEN avg_value END) AS Year_2012,
MAX(CASE WHEN Year = 2013 THEN avg_value END) AS Year_2013,
MAX(CASE WHEN Year = 2014 THEN avg_value END) AS Year_2014,
MAX(CASE WHEN Year = 2015 THEN avg_value END) AS Year_2015,
MAX(CASE WHEN Year = 2016 THEN avg_value END) AS Year_2016,
MAX(CASE WHEN Year = 2017 THEN avg_value END) AS Year_2017,
MAX(CASE WHEN Year = 2018 THEN avg_value END) AS Year_2018,
MAX(CASE WHEN Year = 2019 THEN avg_value END) AS Year_2019,
MAX(CASE WHEN Year = 2020 THEN avg_value END) AS Year_2020,
MAX(CASE WHEN Year = 2021 THEN avg_value END) AS Year_2021,
MAX(CASE WHEN Year = 2022 THEN avg_value END) AS Year_2022,
MAX(CASE WHEN Year = 2023 THEN avg_value END) AS Year_2023
FROM avg_aqi_by_month_year
GROUP BY Month
ORDER BY Month"
)
colnames(reshape_avg_aqi_by_month_year) <- c(
"Month", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006",
"2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015",
"2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023")
boxplot(reshape_avg_aqi_by_month_year[2:26])
# We can see that over the years there is downward trend and variance is decreasing
# except 2020 when we see a high peak likely caused by wildfires
# Link: https://en.wikipedia.org/wiki/Lake_Fire_(2020)
pm2_5_weekly <- apply.weekly(pm2_5_xts, mean)
pm2_5_weekly_ts <- ts(pm2_5_weekly["19990103/20230811"],
start = c(1999,1),
frequency = 52.18)
plot(pm2_5_weekly_ts)
# Strong seasonality, Strong cyclic, light downward trend, variance is reducing
kpss.test(pm2_5_weekly_ts)
## Warning in kpss.test(pm2_5_weekly_ts): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: pm2_5_weekly_ts
## KPSS Level = 3.4904, Truncation lag parameter = 7, p-value = 0.01
adf.test(pm2_5_weekly_ts)
## Warning in adf.test(pm2_5_weekly_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pm2_5_weekly_ts
## Dickey-Fuller = -8.4636, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_weekly_ts, main = "ACF of Weekly Time Series")
pacf(pm2_5_weekly_ts, main = "PACF of Weekly Time Series")
# Split the data into a training and test dataset
train_weekly <- window(pm2_5_weekly_ts, start = c(1999,1), end=c(2020,52))
test_weekly <- window(pm2_5_weekly_ts, start = c(2021,1))
# Looking at spectral analysis
periodogram(train_weekly, log = "no", plot=TRUE,
ylab = "Periodogram",
xlab = "Frequency",
lwd=2, xlim = c(0, 0.06))
# There are two trends:
# 1) A typical yearly (52 weeks) seasonal trend
# 2) A trend that is repeating every 9.6 years (500 weeks)
# 3) A typical half yearly (26 weeks) seasonal trend
# Overall, there is no mixed effect that normal seasonality model cannot capture.
# Decompose the series
plot(decompose(train_weekly, type="multiplicative"))
# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
# However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
# usually which likely could be due to cold temperatures causing pollutant to
# not escape the lower atmosphere.
# Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts
# with a increasing trend in the beginning of the year and drops down towards
# the end of the year. We can see a seasonal effect with a cycle of 12 months.
# Understand the seasonality and remove it to see the trend
train_weekly_diff <- diff(train_weekly, lag = 52)
autoplot(train_weekly_diff, ylab="Train Seasonal Differencing - Weekly Data")
# Let's look if the series is stationary?
kpss.test(train_weekly_diff)
## Warning in kpss.test(train_weekly_diff): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: train_weekly_diff
## KPSS Level = 0.17702, Truncation lag parameter = 7, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail
# to reject the null hypothesis of level stationarity (conclusion: stationary)
adf.test(train_weekly_diff)
## Warning in adf.test(train_weekly_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train_weekly_diff
## Dickey-Fuller = -8.5534, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that
# the time series is non-stationary (conclusion: stationary)
acf(train_weekly_diff, main = "ACF of Seasonal Differencing Time Series - Weekly Data")
pacf(train_weekly_diff, main = "PACF of Seasonal Differencing Time Series - Weekly Data")
# Forecast the seasonal naïve for (2021-01 to 2023-07)
forecast_snaive_weekly <- snaive(train_weekly, h=110)
## Warning in lag.default(y, -lag): 'k' is not an integer
# Plot the forecasts for snaive model
plot(forecast_snaive_weekly, main = "PM 2.5 AQI Forecast - SNaive (Weekly)",
xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_weekly)
# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive_weekly <- mean(abs((test_weekly - forecast_snaive_weekly$mean)/test_weekly))
mape_snaive_weekly
## [1] 0.2749545
# Mean Absolute Error (MAE)
mae_snaive_weekly <- mean(abs((test_weekly - forecast_snaive_weekly$mean)))
mae_snaive_weekly
## [1] 16.28571
# Mean Squared Error (MSE)
mse_snaive_weekly <- mean((test_weekly - forecast_snaive_weekly$mean)^2)
mse_snaive_weekly
## [1] 599.6278
# Evaluate the residuals
checkresiduals(forecast_snaive_weekly)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 509.98, df = 104.36, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 104.36
# Create monthly data
pm2_5_monthly <- apply.monthly(pm2_5_xts, mean)
pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999,1), frequency = 12)
plot(pm2_5_monthly_ts)
kpss.test(pm2_5_monthly_ts)
## Warning in kpss.test(pm2_5_monthly_ts): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: pm2_5_monthly_ts
## KPSS Level = 1.9774, Truncation lag parameter = 5, p-value = 0.01
adf.test(pm2_5_monthly_ts)
## Warning in adf.test(pm2_5_monthly_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pm2_5_monthly_ts
## Dickey-Fuller = -7.4466, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_monthly_ts, main = "ACF of Monthly Time Series")
pacf(pm2_5_monthly_ts, main = "PACF of Monthly Time Series")
# Split the data into a training and test dataset
train_monthly <- window(pm2_5_monthly_ts, start = c(1999,1), end=c(2020,12))
test_monthly <- window(pm2_5_monthly_ts, start = c(2021,1))
# Looking at spectral analysis
periodogram(train_monthly, log = "no", plot=TRUE,
ylab = "Periodogram",
xlab = "Frequency",
lwd=2, xlim = c(0, 0.2))
# There are two trends:
# 1) A typical yearly (12 months) seasonal trend
# 2) A trend that is repeating every 8-9 years (100 months)
# 3) A typical half yearly (6 months) seasonal trend
# Overall, there is no mixed effect that normal seasonality model cannot capture.
# Decompose the series
plot(decompose(train_monthly, type="multiplicative"))
# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
# However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
# usually which likely could be due to cold temperatures causing pollutant to
# not escape the lower atmosphere.
# Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts
# with a increasing trend in the beginning of the year and drops down towards
# the end of the year. We can see a seasonal effect with a cycle of 12 months.
# Understand the seasonality and remove it to see the trend
train_monthly_diff <- diff(train_monthly, lag = 12)
autoplot(train_monthly_diff, ylab="Train Seasonal Differencing (Monthly)")
# Let's look if the series is stationary?
kpss.test(train_monthly_diff)
## Warning in kpss.test(train_monthly_diff): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: train_monthly_diff
## KPSS Level = 0.14573, Truncation lag parameter = 5, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail
# to reject the null hypothesis of level stationarity (conclusion: stationary)
adf.test(train_monthly_diff)
## Warning in adf.test(train_monthly_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train_monthly_diff
## Dickey-Fuller = -5.1056, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that
# the time series is non-stationary (conclusion: stationary)
acf(train_monthly_diff, main = "ACF of Seasonal Differencing Time Series")
pacf(train_monthly_diff, main = "PACF of Seasonal Differencing Time Series")
# Forecast the seasonal naïve for (2021-01 to 2023-07)
forecast_snaive_monthly <- snaive(train_monthly, h=31)
# Plot the forecasts for snaive model
plot(forecast_snaive_monthly, main = "PM 2.5 AQI Forecast - SNaive",
xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_monthly)
# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive <- mean(abs((test_monthly - forecast_snaive_monthly$mean)/test_monthly))
mape_snaive
## [1] 0.20778
# Mean Squared Error (MSE)
mse_snaive <- mean((test_monthly - forecast_snaive_monthly$mean)^2)
mse_snaive
## [1] 282.8901
# Evaluate the residuals
checkresiduals(forecast_snaive_monthly)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 82.387, df = 24, p-value = 2.523e-08
##
## Model df: 0. Total lags used: 24